In [1]:
import requests as r
import re
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import seaborn as sns
import time
import matplotlib.pyplot as plt
%matplotlib inline
sns.set()
import warnings
warnings.simplefilter('ignore') #ignore the warnings, not the errors

WHY

image.png

ChessUrl

image.png

spoiler

image.png

Real why

image.png

image.png

ChessUrl

Data

image.png

In [4]:
df=pd.read_csv('NUMeconomic_freedom_index2019_exploratory.csv')
df
Out[4]:
CountryID Country Name WEBNAME Region World Rank Region Rank 2019 Score Property Rights Judical Effectiveness Government Integrity ... Country Population (Millions) GDP (Billions, PPP) GDP Growth Rate (%) 5 Year GDP Growth Rate (%) GDP per Capita (PPP) Unemployment (%) Inflation (%) FDI Inflow (Millions) Public Debt (% of GDP)

0 rows × 34 columns

Blue: Developed countries

image.png

In [11]:
pd.read_html('https://fr.wikipedia.org/wiki/Pays_d%C3%A9velopp%C3%A9#cite_note-1')[0].iloc[:,0]
Out[11]:
0             États-Unis
1                 Canada
2                   Asie
3              Singapour
4       Hong Kong, Chine
5                  Japon
6           Corée du Sud
7                 Israël
8                 Taïwan
9                 Europe
10               Norvège
11                Suisse
12             Allemagne
13              Danemark
14              Pays-Bas
15               Irlande
16              Finlande
17                 Suède
18         Liechtenstein
19           Royaume-Uni
20            Luxembourg
21                France
22              Belgique
23              Autriche
24              Slovénie
25                Italie
26               Espagne
27    République tchèque
28                 Grèce
29               Estonie
30               Andorre
31                Chypre
32                 Malte
33             Slovaquie
34              Portugal
35               Océanie
36             Australie
37      Nouvelle-Zélande
Name: (Pays, Amérique), dtype: object
In [12]:
developed="""United States,
                 Canada,
               Singapore,
        Hong Kong,
                   Japan,
            South Corea,
                  Israel,
                  Taiwan,
                Norway,
                 Switzerland,
              Germany,
               Denmark,
               Netherlands,
                Ireland,
               Finland,
                  Sweden,
          Liechtenstein,
            United Kingdom,
             Luxembourg,
                 France,
               Belgium,
               Austria,
               Slovenia,
                Italy,
                Spain,
     Czech Republic,
                  Greece,
                Estonia,
                Andorra,
                 Cyprus,
                  Malta,
              Slovakia,
               Portugal,
                Australia,
       New Zealand"""
In [13]:
developed_list=[i.strip().replace(',','') for i in developed.split('\n')]
In [14]:
df['Developed']=df['Country Name'].isin(developed_list)
In [15]:
df["Developed"]=np.where(df["Country Name"]=='Korea, South',1,df["Developed"])
In [16]:
df["Developed"] = df["Developed"].astype(int)
In [17]:
# no andorra
# thanks Tiago
In [18]:
df
Out[18]:
CountryID Country Name WEBNAME Region World Rank Region Rank 2019 Score Property Rights Judical Effectiveness Government Integrity ... Population (Millions) GDP (Billions, PPP) GDP Growth Rate (%) 5 Year GDP Growth Rate (%) GDP per Capita (PPP) Unemployment (%) Inflation (%) FDI Inflow (Millions) Public Debt (% of GDP) Developed
0 1 Afghanistan Afghanistan Asia-Pacific 152.0 39.0 51.5 19.6 29.6 25.2 ... 35.5 69.6 2.5 2.9 1958.0 8.8 5.0 53.9 7.3 0
1 2 Albania Albania Europe 52.0 27.0 66.5 54.8 30.6 40.4 ... 2.9 36.0 3.9 2.5 12507.0 13.9 2.0 1119.1 71.2 0
2 3 Algeria Algeria Middle East and North Africa 171.0 14.0 46.2 31.6 36.2 28.9 ... 41.5 632.9 2.0 3.1 15237.0 10.0 5.6 1203.0 25.8 0
3 4 Angola Angola Sub-Saharan Africa 156.0 33.0 50.6 35.9 26.6 20.5 ... 28.2 190.3 0.7 2.9 6753.0 8.2 31.7 -2254.5 65.3 0
4 5 Argentina Argentina Americas 148.0 26.0 52.2 47.8 44.5 33.5 ... 44.1 920.2 2.9 0.7 20876.0 8.7 25.7 11857.0 52.6 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
181 179 Venezuela Venezuela Americas 179.0 32.0 25.9 7.6 13.1 7.9 ... 31.4 380.7 -14.0 -7.8 12114.0 7.7 1087.5 -68.0 34.9 0
182 180 Vietnam Vietnam Asia-Pacific 128.0 30.0 55.3 49.8 40.3 34.0 ... 93.6 647.4 6.8 6.2 6913.0 2.1 3.5 14100.0 58.2 0
183 181 Yemen Yemen Middle East and North Africa NaN NaN NaN 19.6 22.2 20.3 ... 30.0 38.6 -13.8 -16.1 1287.0 14.0 4.9 -269.9 141.0 0
184 182 Zambia Zambia Sub-Saharan Africa 138.0 27.0 53.6 45.0 35.6 32.3 ... 17.2 68.9 3.6 4.0 3996.0 7.8 6.6 1091.2 62.2 0
185 183 Zimbabwe Zimbabwe Sub-Saharan Africa 175.0 45.0 40.4 29.7 24.8 15.8 ... 14.9 34.0 3.0 2.6 2283.0 5.0 1.3 289.4 78.4 0

186 rows × 35 columns

Insights

ChessUrl

In [19]:
import plotly.express as px

fig = px.scatter(y=df['2019 Score'], x=df['GDP per Capita (PPP)'], color=df.Developed, hover_name=df.Country, labels={"x": "GDP per Capita (PPP), $",  "y": "Economic Freedom Score"})
fig.update_layout(hovermode="x")

fig.show()

image.png

GDP per capita prediction

ChessUrl

DecisionTree (GDP per capita)

image.png


image.png

image.png

weighted values

RandomForest (GDP per capita)

image.png

image.png

LinearRegression (GDP per capita)

image.png

image.png

image.png

GDP growth (5 years)

ChessUrl

DecisionTree (GDP growth (5 years)

image.png

image.png

image.png

RandomForest (GDP growth (5 years)

image.png

image.png

Linear Regression (GDP growth (5 years)

image.png


image.png

image.png

Developed or not

ChessUrl

Decision tree

image.png


image.png


image-2.png

  • I joined 2 dataframes but look how striking is the difference from real values and columns from that index

RandomForest

image.png

image.png

After all that..

image.png

Models

ChessUrl

Dropping columns

In [23]:
cols_to_drop=[]
cols_to_drop.append('CountryID')
cols_to_drop.append('WEBNAME')
cols_to_drop.append('Region')
cols_to_drop.append('World Rank')
cols_to_drop.append('Region Rank')
cols_to_drop.append('Country')
cols_to_drop.append('2019 Score') #Not sure drop it or not
cols_to_drop.append('Country Name')
In [24]:
df.fillna(df.mean(),inplace=True)

Models

In [25]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, recall_score, f1_score, precision_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
In [26]:
X_train, X_test,y_train,y_test = train_test_split(df.drop('Developed',axis=1), \
                                                  df.Developed, \
                                                  test_size=1/3, \
                                                  random_state=42)
cool models

ChessUrl

In [27]:
# decision tree

model1=DecisionTreeClassifier()
model1.fit(X_train, y_train)
y_pred1=model1.predict(X_test)
conf1=confusion_matrix(y_test, y_pred1)
acc1=accuracy_score(y_test, y_pred1)
rec1=recall_score(y_test, y_pred1)
pre1=precision_score(y_test, y_pred1)
f11=f1_score(y_test, y_pred1)
display(conf1)
print('Accuracy', acc1)
print('Recall', rec1)
print('Precision', pre1)
print('F1', f11)
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-27-8ef748c59a1f> in <module>
      2 
      3 model1=DecisionTreeClassifier()
----> 4 model1.fit(X_train, y_train)
      5 y_pred1=model1.predict(X_test)
      6 conf1=confusion_matrix(y_test, y_pred1)

C:\MEGA\IronHack\python\lib\site-packages\sklearn\tree\_classes.py in fit(self, X, y, sample_weight, check_input, X_idx_sorted)
    888         """
    889 
--> 890         super().fit(
    891             X, y,
    892             sample_weight=sample_weight,

C:\MEGA\IronHack\python\lib\site-packages\sklearn\tree\_classes.py in fit(self, X, y, sample_weight, check_input, X_idx_sorted)
    154             check_X_params = dict(dtype=DTYPE, accept_sparse="csc")
    155             check_y_params = dict(ensure_2d=False, dtype=None)
--> 156             X, y = self._validate_data(X, y,
    157                                        validate_separately=(check_X_params,
    158                                                             check_y_params))

C:\MEGA\IronHack\python\lib\site-packages\sklearn\base.py in _validate_data(self, X, y, reset, validate_separately, **check_params)
    427                 # :(
    428                 check_X_params, check_y_params = validate_separately
--> 429                 X = check_array(X, **check_X_params)
    430                 y = check_array(y, **check_y_params)
    431             else:

C:\MEGA\IronHack\python\lib\site-packages\sklearn\utils\validation.py in inner_f(*args, **kwargs)
     70                           FutureWarning)
     71         kwargs.update({k: arg for k, arg in zip(sig.parameters, args)})
---> 72         return f(**kwargs)
     73     return inner_f
     74 

C:\MEGA\IronHack\python\lib\site-packages\sklearn\utils\validation.py in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator)
    596                     array = array.astype(dtype, casting="unsafe", copy=False)
    597                 else:
--> 598                     array = np.asarray(array, order=order, dtype=dtype)
    599             except ComplexWarning:
    600                 raise ValueError("Complex data not supported\n"

C:\MEGA\IronHack\python\lib\site-packages\numpy\core\_asarray.py in asarray(a, dtype, order)
     81 
     82     """
---> 83     return array(a, dtype, copy=False, order=order)
     84 
     85 

C:\MEGA\IronHack\python\lib\site-packages\pandas\core\generic.py in __array__(self, dtype)
   1776 
   1777     def __array__(self, dtype=None) -> np.ndarray:
-> 1778         return np.asarray(self._values, dtype=dtype)
   1779 
   1780     def __array_wrap__(self, result, context=None):

C:\MEGA\IronHack\python\lib\site-packages\numpy\core\_asarray.py in asarray(a, dtype, order)
     81 
     82     """
---> 83     return array(a, dtype, copy=False, order=order)
     84 
     85 

ValueError: could not convert string to float: 'Sweden'
In [78]:
# Random forest

model1=RandomForestClassifier()
model1.fit(X_train, y_train)
y_pred1=model1.predict(X_test)
conf1=confusion_matrix(y_test, y_pred1)
acc1=accuracy_score(y_test, y_pred1)
rec1=recall_score(y_test, y_pred1)
pre1=precision_score(y_test, y_pred1)
f11=f1_score(y_test, y_pred1)
display(conf1)
print('Accuracy', acc1)
print('Recall', rec1)
print('Precision', pre1)
print('F1', f11)
array([[51,  1],
       [ 2,  8]], dtype=int64)
Accuracy 0.9516129032258065
Recall 0.8
Precision 0.8888888888888888
F1 0.8421052631578948
In [79]:
# Random forest balanced

model1=RandomForestClassifier(class_weight='balanced')
model1.fit(X_train, y_train)
y_pred1=model1.predict(X_test)
conf1=confusion_matrix(y_test, y_pred1)
acc1=accuracy_score(y_test, y_pred1)
rec1=recall_score(y_test, y_pred1)
pre1=precision_score(y_test, y_pred1)
f11=f1_score(y_test, y_pred1)
display(conf1)
print('Accuracy', acc1)
print('Recall', rec1)
print('Precision', pre1)
print('F1', f11)
array([[51,  1],
       [ 2,  8]], dtype=int64)
Accuracy 0.9516129032258065
Recall 0.8
Precision 0.8888888888888888
F1 0.8421052631578948
In [80]:
# adaboost

model1=AdaBoostClassifier()
model1.fit(X_train, y_train)
y_pred1=model1.predict(X_test)
conf1=confusion_matrix(y_test, y_pred1)
acc1=accuracy_score(y_test, y_pred1)
rec1=recall_score(y_test, y_pred1)
pre1=precision_score(y_test, y_pred1)
f11=f1_score(y_test, y_pred1)
display(conf1)
print('Accuracy', acc1)
print('Recall', rec1)
print('Precision', pre1)
print('F1', f11)
array([[51,  1],
       [ 5,  5]], dtype=int64)
Accuracy 0.9032258064516129
Recall 0.5
Precision 0.8333333333333334
F1 0.625
In [27]:
from xgboost import XGBClassifier
from catboost import CatBoostlassifier

#dont work
---------------------------------------------------------------------------
ModuleNotFoundError                       Traceback (most recent call last)
<ipython-input-27-19c4b76ee8fa> in <module>
----> 1 from xgboost import XGBClassifier
      2 from catboost import CatBoostlassifier
      3 
      4 #dont work

ModuleNotFoundError: No module named 'xgboost'
not cool models

ChessUrl

In [81]:
knn=KNeighborsClassifier(5)
knn.fit(X_train,y_train)
y_pred1=knn.predict(X_test)
display(confusion_matrix(y_pred1,y_test))
display(accuracy_score(y_pred1,y_test))
array([[52,  6],
       [ 0,  4]], dtype=int64)
0.9032258064516129
In [82]:
lr=LogisticRegression(max_iter=10000)
lr.fit(X_train,y_train)
y_pred2=lr.predict(X_test)
display(confusion_matrix(y_pred2,y_test))
display(accuracy_score(y_pred2,y_test))
array([[50,  1],
       [ 2,  9]], dtype=int64)
0.9516129032258065
In [83]:
sv=SVC()
sv.fit(X_train,y_train)
y_pred3=sv.predict(X_test)
display(confusion_matrix(y_pred3,y_test))
display(accuracy_score(y_pred3,y_test))
array([[52,  8],
       [ 0,  2]], dtype=int64)
0.8709677419354839
In [84]:
nb=GaussianNB()
nb.fit(X_train,y_train)
y_pred4=nb.predict(X_test)
display(confusion_matrix(y_pred4,y_test))
display(accuracy_score(y_pred4,y_test))
array([[49,  1],
       [ 3,  9]], dtype=int64)
0.9354838709677419
In [ ]:
#wrap

Learnings

  • models are nice
  • a level of development of a country is predictable to 95%
  • we must all protect our property

image.png

if I was to start from scratch...

  • copy everything from my old notebook not to call a friend at 2h30 in the night to find the dataset
In [1]:
import plotly
plotly.offline.init_notebook_mode()